{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_len = 100\n",
    "\n",
    "embed_size = 256\n",
    "encoder_num_banks = 16\n",
    "decoder_num_banks = 8\n",
    "num_highway_blocks = 4\n",
    "reduction_factor = 5\n",
    "\n",
    "learning_rate = 1e-4\n",
    "batch_size = 32\n",
    "\n",
    "from modules import (\n",
    "    conv1d,\n",
    "    normalize_in,\n",
    "    highwaynet,\n",
    "    gru,\n",
    "    attention_decoder,\n",
    "    prenet,\n",
    "    embed,\n",
    "    shift_by_one,\n",
    "    conv1d_banks,\n",
    ")\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/.local/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset,1.0)\n",
    "print (trainset.target_names)\n",
    "print (len(trainset.data))\n",
    "print (len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vocab from size: 20332\n",
      "Most common words [('film', 1453), ('movie', 1270), ('one', 727), ('like', 721), ('story', 477), ('much', 386)]\n",
      "Sample data [2723, 5347, 24, 35, 7006, 15, 64, 1951, 5348, 460] ['murderous', 'maids', 'may', 'well', 'comprehensive', 'films', 'also', 'strike', 'closest', 'truth']\n"
     ]
    }
   ],
   "source": [
    "concat = ' '.join(trainset.data).split()\n",
    "vocabulary_size = len(list(set(concat)))\n",
    "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
    "print('vocab from size: %d'%(vocabulary_size))\n",
    "print('Most common words', count[4:10])\n",
    "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "GO = dictionary['GO']\n",
    "PAD = dictionary['PAD']\n",
    "EOS = dictionary['EOS']\n",
    "UNK = dictionary['UNK']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def encode(inputs, is_training = True, scope = 'encoder', reuse = None):\n",
    "    with tf.variable_scope(scope, reuse = reuse):\n",
    "        prenet_out = prenet(inputs, scope = 'prenet', is_training = is_training)\n",
    "        enc = conv1d_banks(\n",
    "            prenet_out, K = encoder_num_banks, is_training = is_training\n",
    "        )\n",
    "        enc = tf.layers.max_pooling1d(enc, 2, 1, padding = 'same')\n",
    "        enc = conv1d(enc, embed_size // 2, 3, scope = 'conv1d_1')\n",
    "        enc = normalize_in(enc, activation_fn = tf.nn.relu)\n",
    "        enc = conv1d(enc, embed_size // 2, 3, scope = 'conv1d_2')\n",
    "        enc = normalize_in(enc, activation_fn = tf.nn.relu)\n",
    "        enc += prenet_out\n",
    "        for i in range(num_highway_blocks):\n",
    "            enc = highwaynet(\n",
    "                enc, units = embed_size // 2, scope = 'highwaynet_%d' % (i)\n",
    "            )\n",
    "        memory = gru(enc, embed_size // 2, True)\n",
    "    return memory\n",
    "\n",
    "\n",
    "def decode(\n",
    "    inputs, memory, output_size, is_training = True, scope = 'decoder_layers', reuse = None\n",
    "):\n",
    "    with tf.variable_scope(scope, reuse = reuse):\n",
    "        dec = prenet(inputs, is_training = is_training)\n",
    "        dec = attention_decoder(dec, memory, embed_size)\n",
    "        dec += gru(dec, embed_size, False, scope = 'gru1')\n",
    "        dec += gru(dec, embed_size, False, scope = 'gru2')\n",
    "        return tf.layers.dense(dec, output_size)\n",
    "\n",
    "class Model:\n",
    "    def __init__(self, dict_size, dimension_output):\n",
    "        self.X = tf.placeholder(tf.int32, [None, None])\n",
    "        self.Y = tf.placeholder(tf.int32, [None])\n",
    "        self.is_training = tf.placeholder(tf.bool, None)\n",
    "        \n",
    "        encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embed_size], -1, 1))\n",
    "        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)\n",
    "        \n",
    "        self.decoder_inputs = embed(\n",
    "            shift_by_one(self.X), dict_size, embed_size\n",
    "        )\n",
    "        with tf.variable_scope('net'):\n",
    "            self.memory = encode(encoder_embedded, is_training = self.is_training)\n",
    "            self.outputs = decode(\n",
    "                self.decoder_inputs, self.memory, dimension_output, is_training = self.is_training\n",
    "            )\n",
    "        self.logits = self.outputs[:,-1]\n",
    "        self.cost = tf.reduce_mean(\n",
    "            tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
    "                logits = self.logits, labels = self.Y\n",
    "            )\n",
    "        )\n",
    "        self.optimizer = tf.train.AdamOptimizer(\n",
    "            learning_rate = learning_rate\n",
    "        ).minimize(self.cost)\n",
    "        correct_pred = tf.equal(\n",
    "            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y\n",
    "        )\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(len(dictionary),len(trainset.target_names))\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "maxlen = 100\n",
    "batch_size = 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectors = str_idx(trainset.data,dictionary,maxlen)\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(vectors, trainset.target,test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 534/534 [05:32<00:00,  1.39s/it, accuracy=1, cost=0.579]    \n",
      "test minibatch loop: 100%|██████████| 134/134 [00:39<00:00,  1.73it/s, accuracy=0.6, cost=0.69]   \n",
      "train minibatch loop:   0%|          | 0/534 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 0, pass acc: 0.000000, current acc: 0.735865\n",
      "time taken: 371.9133610725403\n",
      "epoch: 0, training loss: 0.655707, training acc: 0.584594, valid loss: 0.544825, valid acc: 0.735865\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 534/534 [05:23<00:00,  1.71it/s, accuracy=1, cost=0.169]     \n",
      "test minibatch loop: 100%|██████████| 134/134 [00:37<00:00,  3.60it/s, accuracy=0.6, cost=0.682]  \n",
      "train minibatch loop:   0%|          | 0/534 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch: 1, pass acc: 0.735865, current acc: 0.762588\n",
      "time taken: 360.5551178455353\n",
      "epoch: 1, training loss: 0.360291, training acc: 0.842537, valid loss: 0.586421, valid acc: 0.762588\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 534/534 [05:20<00:00,  1.73it/s, accuracy=1, cost=0.0132]    \n",
      "test minibatch loop: 100%|██████████| 134/134 [00:37<00:00,  3.62it/s, accuracy=0.6, cost=1.26]   \n",
      "train minibatch loop:   0%|          | 0/534 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 358.11430191993713\n",
      "epoch: 2, training loss: 0.170106, training acc: 0.933990, valid loss: 0.819403, valid acc: 0.751336\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 534/534 [05:20<00:00,  1.70it/s, accuracy=1, cost=0.00684]   \n",
      "test minibatch loop: 100%|██████████| 134/134 [00:37<00:00,  3.62it/s, accuracy=0.6, cost=1.66]   \n",
      "train minibatch loop:   0%|          | 0/534 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "time taken: 358.29570269584656\n",
      "epoch: 3, training loss: 0.082946, training acc: 0.971509, valid loss: 1.063742, valid acc: 0.747586\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train minibatch loop: 100%|██████████| 534/534 [05:21<00:00,  1.72it/s, accuracy=1, cost=0.00145]   \n",
      "test minibatch loop:  90%|█████████ | 121/134 [00:33<00:03,  3.56it/s, accuracy=0.812, cost=1.47] "
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import time\n",
    "\n",
    "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0\n",
    "\n",
    "while True:\n",
    "    lasttime = time.time()\n",
    "    if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
    "        print('break epoch:%d\\n' % (EPOCH))\n",
    "        break\n",
    "\n",
    "    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0\n",
    "    pbar = tqdm(\n",
    "        range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n",
    "    )\n",
    "    for i in pbar:\n",
    "        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]\n",
    "        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]\n",
    "        acc, cost, _ = sess.run(\n",
    "            [model.accuracy, model.cost, model.optimizer],\n",
    "            feed_dict = {\n",
    "                model.Y: batch_y,\n",
    "                model.X: batch_x\n",
    "            },\n",
    "        )\n",
    "        assert not np.isnan(cost)\n",
    "        train_loss += cost\n",
    "        train_acc += acc\n",
    "        pbar.set_postfix(cost = cost, accuracy = acc)\n",
    "\n",
    "    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')\n",
    "    for i in pbar:\n",
    "        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]\n",
    "        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]\n",
    "        acc, cost = sess.run(\n",
    "            [model.accuracy, model.cost],\n",
    "            feed_dict = {\n",
    "                model.Y: batch_y,\n",
    "                model.X: batch_x\n",
    "            },\n",
    "        )\n",
    "        test_loss += cost\n",
    "        test_acc += acc\n",
    "        pbar.set_postfix(cost = cost, accuracy = acc)\n",
    "\n",
    "    train_loss /= len(train_X) / batch_size\n",
    "    train_acc /= len(train_X) / batch_size\n",
    "    test_loss /= len(test_X) / batch_size\n",
    "    test_acc /= len(test_X) / batch_size\n",
    "\n",
    "    if test_acc > CURRENT_ACC:\n",
    "        print(\n",
    "            'epoch: %d, pass acc: %f, current acc: %f'\n",
    "            % (EPOCH, CURRENT_ACC, test_acc)\n",
    "        )\n",
    "        CURRENT_ACC = test_acc\n",
    "        CURRENT_CHECKPOINT = 0\n",
    "    else:\n",
    "        CURRENT_CHECKPOINT += 1\n",
    "        \n",
    "    print('time taken:', time.time() - lasttime)\n",
    "    print(\n",
    "        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'\n",
    "        % (EPOCH, train_loss, train_acc, test_loss, test_acc)\n",
    "    )\n",
    "    EPOCH += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "validation minibatch loop: 100%|██████████| 134/134 [01:05<00:00,  2.07it/s]\n"
     ]
    }
   ],
   "source": [
    "real_Y, predict_Y = [], []\n",
    "\n",
    "pbar = tqdm(\n",
    "    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'\n",
    ")\n",
    "for i in pbar:\n",
    "    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]\n",
    "    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]\n",
    "    predict_Y += np.argmax(\n",
    "        sess.run(\n",
    "            model.logits, feed_dict = {model.X: batch_x, model.Y: batch_y}\n",
    "        ),\n",
    "        1,\n",
    "    ).tolist()\n",
    "    real_Y += batch_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "   negative       0.73      0.77      0.75      1073\n",
      "   positive       0.75      0.71      0.73      1060\n",
      "\n",
      "avg / total       0.74      0.74      0.74      2133\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(metrics.classification_report(real_Y, predict_Y, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
